{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.chdir('/home/zola/Projects/temp/KBQA/src')\n",
    "from setup import IndexSearch, Mongo_Connector\n",
    "\n",
    "e_index = IndexSearch('dbpedia201604e')\n",
    "p_index = IndexSearch('dbpedia201604p')\n",
    "\n",
    "import json\n",
    "from collections import Counter"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### LC-QUAD"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4998 docs\n",
      "{'1hop': [['http://dbpedia.org/resource/Bill_Finger',\n",
      "           'http://dbpedia.org/ontology/ComicsCharacter'],\n",
      "          ['http://dbpedia.org/ontology/creator',\n",
      "           'http://www.w3.org/1999/02/22-rdf-syntax-ns#type']],\n",
      " '1hop_ids': [[5516345, 2979916], [362, 68655]],\n",
      " '1hop_spans': [['bill finger', 'comic characters'], ['painted by']],\n",
      " '2hop': [[], []],\n",
      " '2hop_ids': [[], []],\n",
      " '2hop_spans': [[], []],\n",
      " 'SerialNumber': '1',\n",
      " '_id': ObjectId('5c3501d00666955066857ae2'),\n",
      " 'answers': ['http://dbpedia.org/resource/Batman',\n",
      "             'http://dbpedia.org/resource/Alfred_Pennyworth',\n",
      "             'http://dbpedia.org/resource/Apache_Chief',\n",
      "             'http://dbpedia.org/resource/James_Gordon_(comics)',\n",
      "             'http://dbpedia.org/resource/Hugo_Strange',\n",
      "             'http://dbpedia.org/resource/Wildcat_(comics)',\n",
      "             'http://dbpedia.org/resource/Bat-Mite',\n",
      "             'http://dbpedia.org/resource/Vicki_Vale',\n",
      "             'http://dbpedia.org/resource/Squire_(comics)',\n",
      "             'http://dbpedia.org/resource/Crime_Doctor_(comics)',\n",
      "             'http://dbpedia.org/resource/Knight_(comics)',\n",
      "             'http://dbpedia.org/resource/Signalman_(comics)',\n",
      "             'http://dbpedia.org/resource/Lana_Lang',\n",
      "             'http://dbpedia.org/resource/Penguin_(comics)',\n",
      "             'http://dbpedia.org/resource/Riddler',\n",
      "             'http://dbpedia.org/resource/Scarecrow_(DC_Comics)',\n",
      "             'http://dbpedia.org/resource/Dick_Grayson',\n",
      "             'http://dbpedia.org/resource/Lori_Lemaris',\n",
      "             'http://dbpedia.org/resource/Two-Face',\n",
      "             'http://dbpedia.org/resource/Clayface',\n",
      "             'http://dbpedia.org/resource/Super-Sons',\n",
      "             'http://dbpedia.org/resource/Mad_Hatter_(comics)',\n",
      "             'http://dbpedia.org/resource/Firefly_(DC_Comics)',\n",
      "             'http://dbpedia.org/resource/Killer_Moth',\n",
      "             'http://dbpedia.org/resource/Catman_(comics)',\n",
      "             'http://dbpedia.org/resource/Doiby_Dickles',\n",
      "             'http://dbpedia.org/resource/Martha_Wayne',\n",
      "             'http://dbpedia.org/resource/Ace_the_Bat-Hound',\n",
      "             'http://dbpedia.org/resource/Calendar_Man',\n",
      "             'http://dbpedia.org/resource/Alan_Scott',\n",
      "             'http://dbpedia.org/resource/Tony_Zucco',\n",
      "             'http://dbpedia.org/resource/Isbisa',\n",
      "             'http://dbpedia.org/resource/Batman_(Dark_Knight_Universe)',\n",
      "             'http://dbpedia.org/resource/Tiger_Shark_(DC_Comics)',\n",
      "             'http://dbpedia.org/resource/Joker_(comics)',\n",
      "             'http://dbpedia.org/resource/Thomas_Wayne',\n",
      "             'http://dbpedia.org/resource/Robin_(Earth-Two)',\n",
      "             'http://dbpedia.org/resource/Lew_Moxon',\n",
      "             'http://dbpedia.org/resource/Batman_(Terry_McGinnis)',\n",
      "             'http://dbpedia.org/resource/Joe_Chill',\n",
      "             'http://dbpedia.org/resource/Professor_Milo',\n",
      "             'http://dbpedia.org/resource/Kite_Man',\n",
      "             'http://dbpedia.org/resource/Zebra-Man',\n",
      "             'http://dbpedia.org/resource/Bette_Kane',\n",
      "             'http://dbpedia.org/resource/Batman_(Earth-Two)',\n",
      "             'http://dbpedia.org/resource/Sal_Maroni'],\n",
      " 'answers_ids': [5248779,\n",
      "                 4164151,\n",
      "                 4620349,\n",
      "                 13091507,\n",
      "                 12436800,\n",
      "                 23909243,\n",
      "                 5242458,\n",
      "                 23314400,\n",
      "                 20929173,\n",
      "                 8450386,\n",
      "                 14247619,\n",
      "                 20536971,\n",
      "                 14560937,\n",
      "                 18175910,\n",
      "                 19363274,\n",
      "                 20129584,\n",
      "                 9057727,\n",
      "                 15268137,\n",
      "                 22849128,\n",
      "                 8068063,\n",
      "                 21290182,\n",
      "                 15533836,\n",
      "                 10659084,\n",
      "                 14160974,\n",
      "                 7562161,\n",
      "                 9202607,\n",
      "                 15873650,\n",
      "                 3760930,\n",
      "                 6144287,\n",
      "                 4025516,\n",
      "                 22612661,\n",
      "                 12879671,\n",
      "                 5249181,\n",
      "                 22465651,\n",
      "                 13559159,\n",
      "                 22417130,\n",
      "                 19482439,\n",
      "                 14816637,\n",
      "                 5249199,\n",
      "                 13386881,\n",
      "                 18737298,\n",
      "                 14224699,\n",
      "                 24375813,\n",
      "                 5456603,\n",
      "                 5249183,\n",
      "                 19921099],\n",
      " 'c1_spans': ['comic characters'],\n",
      " 'c2_spans': [],\n",
      " 'checked': 'true',\n",
      " 'classes': ['http://dbpedia.org/ontology/ComicsCharacter'],\n",
      " 'classes_ids': [2979916],\n",
      " 'complex_bool': 0,\n",
      " 'e1_spans': ['bill finger'],\n",
      " 'entity mapping': [{'label': 'Bill Finger',\n",
      "                     'matchedBy': 'spotlight',\n",
      "                     'seq': '38,49',\n",
      "                     'uri': 'http://dbpedia.org/resource/Bill_Finger'}],\n",
      " 'entity_ids': [2979916, 5516345],\n",
      " 'entity_uris': ['http://dbpedia.org/ontology/ComicsCharacter',\n",
      "                 'http://dbpedia.org/resource/Bill_Finger'],\n",
      " 'id': 'f0a9f1ca14764095ae089b152e0e7f12',\n",
      " 'p1_spans': ['painted by'],\n",
      " 'p2_spans': [],\n",
      " 'predicate mapping': [{'label': 'painted by',\n",
      "                        'mappedBy': 'manual corrections',\n",
      "                        'seq': '27,37',\n",
      "                        'uri': 'http://dbpedia.org/ontology/creator'},\n",
      "                       {'label': 'comic characters',\n",
      "                        'mappedBy': 'manual corrections',\n",
      "                        'seq': '6,22',\n",
      "                        'uri': 'http://dbpedia.org/ontology/ComicsCharacter'}],\n",
      " 'predicate_ids': [362, 68655],\n",
      " 'predicate_uris': ['http://dbpedia.org/ontology/creator',\n",
      "                    'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'],\n",
      " 'q_parse': ['which',\n",
      "             'comic',\n",
      "             'characters',\n",
      "             'are',\n",
      "             'painted',\n",
      "             'by',\n",
      "             'bill',\n",
      "             'finger'],\n",
      " 'question': 'Which comic characters are painted by Bill Finger?',\n",
      " 'question_type': 'SELECT',\n",
      " 's_parse': ['', 'E1', 'E1', '', 'P1', 'P1', 'E1', 'E1'],\n",
      " 'sparql_id': '301',\n",
      " 'sparql_query': 'SELECT DISTINCT ?uri WHERE {?uri '\n",
      "                 '<http://dbpedia.org/ontology/creator> '\n",
      "                 '<http://dbpedia.org/resource/Bill_Finger>  . ?uri '\n",
      "                 '<http://www.w3.org/1999/02/22-rdf-syntax-ns#type> '\n",
      "                 '<http://dbpedia.org/ontology/ComicsCharacter>}',\n",
      " 'train': True}\n"
     ]
    }
   ],
   "source": [
    "mongo = Mongo_Connector('kbqa', 'lcquad')\n",
    "\n",
    "# load LC-QUAD dataset\n",
    "loaded = True\n",
    "\n",
    "import os\n",
    "os.chdir(\"/home/zola/Projects/temp/KBQA/data/lcquad\")\n",
    "import pprint\n",
    "lcquad_path = \"lcquad_answers.json\"\n",
    "if not loaded:\n",
    "    mongo.load_json(lcquad_path)\n",
    "mongo.count_all_docs()\n",
    "doc = mongo.get_sample(limit=1)[0]\n",
    "pprint.pprint(doc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "True\n"
     ]
    }
   ],
   "source": [
    "# define random test/train splits\n",
    "loaded = True\n",
    "limit = None\n",
    "\n",
    "if not loaded:\n",
    "    test_size = 0.2\n",
    "    with open(lcquad_path, \"r\") as json_file:\n",
    "        docs = json.load(json_file)\n",
    "        X, question_types = [], []\n",
    "        for doc in docs:\n",
    "            X.append(doc['SerialNumber'])\n",
    "            question_types.append(doc['question_type'])\n",
    "        print(\"%d docs loaded\"%len(docs))\n",
    "\n",
    "    from sklearn.model_selection import train_test_split\n",
    "    # fix random seed\n",
    "    X_train, X_test = train_test_split(X, test_size=test_size,\n",
    "                                       stratify=question_types, random_state=103232)\n",
    "    print(\"%d train samples and %d test samples\" % (len(X_train), len(X_test)))\n",
    "\n",
    "    # annotate\n",
    "    samples = mongo.get_sample(limit=limit)\n",
    "    for doc in samples:\n",
    "        if doc['SerialNumber'] in X_train:\n",
    "            doc['train'] = True\n",
    "        else:\n",
    "            doc['train'] = False\n",
    "        mongo.col.update_one({'_id': doc['_id']}, {\"$set\": doc}, upsert=True)\n",
    "\n",
    "sample = mongo.get_sample(limit=1)[100]\n",
    "print(sample['train'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "True\n"
     ]
    }
   ],
   "source": [
    "# reproduce the original lcquad test/train splits\n",
    "loaded = True\n",
    "\n",
    "limit = None\n",
    "lcquad_train_path = 'train-data.json'  # wget https://raw.githubusercontent.com/AskNowQA/LC-QuAD/data/train-data.json\n",
    "\n",
    "if not loaded:\n",
    "    X_train = []\n",
    "   \n",
    "    with open(lcquad_train_path, \"r\") as json_file:\n",
    "        docs = json.load(json_file)\n",
    "        for doc in docs:\n",
    "            X_train.append(str(int(doc['_id']) + 1))\n",
    "        print(\"%d docs loaded\"%len(docs))\n",
    "\n",
    "    print(\"%d train samples\" % (len(X_train)))\n",
    "    # annotate\n",
    "    samples = mongo.get_sample(train=False, limit=limit)\n",
    "    for doc in samples:\n",
    "        if doc['SerialNumber'] in X_train:\n",
    "            doc['train'] = True\n",
    "        else:\n",
    "            doc['train'] = False\n",
    "        mongo.col.update_one({'_id': doc['_id']}, {\"$set\": doc}, upsert=True)\n",
    "\n",
    "sample = mongo.get_sample(limit=1)[100]\n",
    "print(sample['train'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1000 documents annotated with entities and predicates URIs across hops\n",
      "[['http://dbpedia.org/resource/Gestapo'], ['http://dbpedia.org/ontology/parentOrganisation']]\n",
      "[[], ['http://dbpedia.org/ontology/leader']]\n",
      "['http://dbpedia.org/resource/Gestapo']\n",
      "['http://dbpedia.org/ontology/leader', 'http://dbpedia.org/ontology/parentOrganisation']\n"
     ]
    }
   ],
   "source": [
    "# annotate GS entities and predicates across hops\n",
    "# parse entities and predicates annotations across hops from the SQL query and update MongoDB collection\n",
    "loaded = False\n",
    "\n",
    "verbose = False\n",
    "limit = None\n",
    "\n",
    "if not loaded:\n",
    "    samples = mongo.get_sample(train=False, limit=limit)\n",
    "    count = 0\n",
    "    for doc in samples:\n",
    "        # fix URI !\n",
    "        sparql_query = doc['sparql_query']\n",
    "        # parse the SPARQL query into spo triples\n",
    "        tripples = sparql_query[sparql_query.find(\"{\")+1:sparql_query.find(\"}\")].split('. ')\n",
    "\n",
    "        # collect entities and predicates separately for 2 hops\n",
    "        correct_intermediate_predicates = []\n",
    "        correct_intermediate_entities = []\n",
    "        correct_question_predicates = []\n",
    "        correct_question_entities = []\n",
    "\n",
    "        for tripple in tripples:\n",
    "            if tripple:\n",
    "                entities = []\n",
    "                s, p, o = tripple.strip().split()\n",
    "                if s[0] != '?':\n",
    "                    entities.append(s[1:-1])\n",
    "                if o[0] != '?':\n",
    "                    entities.append(o[1:-1])\n",
    "                p = p[1:-1]\n",
    "\n",
    "                if '?uri' not in tripple:\n",
    "                    correct_intermediate_predicates.append(p)\n",
    "                    correct_intermediate_entities.extend(entities)\n",
    "                else:\n",
    "                    correct_question_predicates.append(p)\n",
    "                    correct_question_entities.extend(entities)\n",
    "        if verbose:\n",
    "            print('\\n')\n",
    "            print(sparql_query)\n",
    "            print(correct_intermediate_entities, correct_intermediate_predicates)\n",
    "            print(correct_question_entities, correct_question_predicates)\n",
    "\n",
    "        # update question annotations save in MongoDB\n",
    "        if not correct_intermediate_predicates:\n",
    "            # 1 hop\n",
    "            doc['1hop'] = [correct_question_entities, correct_question_predicates]\n",
    "            doc['2hop'] = [[], []]\n",
    "        else:\n",
    "            # 2 hops\n",
    "            doc['1hop'] = [correct_intermediate_entities, correct_intermediate_predicates]\n",
    "            doc['2hop'] = [correct_question_entities, correct_question_predicates]\n",
    "        \n",
    "        doc['entity_uris'] = list(set(correct_question_entities+correct_intermediate_entities))\n",
    "        # store all predicate URIs for subgraph extraction\n",
    "        doc['predicate_uris'] = list(set(correct_question_predicates+correct_intermediate_predicates))\n",
    "        \n",
    "        mongo.col.update_one({'_id': doc['_id']}, {\"$set\": doc}, upsert=True)\n",
    "        count +=1\n",
    "\n",
    "    print(\"%d documents annotated with entities and predicates URIs across hops\"%count)\n",
    "\n",
    "# show sample annotation\n",
    "sample = mongo.get_by_id(\"3\")\n",
    "print(sample['1hop'])\n",
    "print(sample['2hop'])\n",
    "print(sample['entity_uris'])\n",
    "print(sample['predicate_uris'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "True\n"
     ]
    }
   ],
   "source": [
    "# define random test/train split\n",
    "loaded = True\n",
    "limit = None\n",
    "\n",
    "if not loaded:\n",
    "    test_size = 0.2\n",
    "    with open(lcquad_path, \"r\") as json_file:\n",
    "        docs = json.load(json_file)\n",
    "        X, question_types = [], []\n",
    "        for doc in docs:\n",
    "            X.append(doc['SerialNumber'])\n",
    "            question_types.append(doc['question_type'])\n",
    "        print(\"%d docs loaded\"%len(docs))\n",
    "\n",
    "    from sklearn.model_selection import train_test_split\n",
    "    # fix random seed\n",
    "    X_train, X_test = train_test_split(X, test_size=test_size,\n",
    "                                       stratify=question_types, random_state=103232)\n",
    "    print(\"%d train samples and %d test samples\" % (len(X_train), len(X_test)))\n",
    "\n",
    "    # annotate\n",
    "    samples = mongo.get_sample(limit=limit)\n",
    "    for doc in samples:\n",
    "        if doc['SerialNumber'] in X_train:\n",
    "            doc['train'] = True\n",
    "        else:\n",
    "            doc['train'] = False\n",
    "        mongo.col.update_one({'_id': doc['_id']}, {\"$set\": doc}, upsert=True)\n",
    "\n",
    "sample = mongo.get_sample(limit=1)[100]\n",
    "print(sample['train'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1000 documents annotated with ids\n",
      "1000 documents annotated with ids\n",
      "[[20271581, 24029645], [1076]]\n",
      "[[], []]\n"
     ]
    }
   ],
   "source": [
    "# store HDT IDs across hops\n",
    "limit = None\n",
    "\n",
    "def annotate_hop_ids(hop):\n",
    "    samples = mongo.get_sample(train=False, limit=limit)\n",
    "    count = 0\n",
    "    for doc in samples:\n",
    "        e, p = doc[hop]\n",
    "        e_ids = []\n",
    "        for uri in e:\n",
    "            try:\n",
    "                e_ids.append(e_index.look_up_by_uri(uri)[0]['_source']['id'])\n",
    "            except:\n",
    "                print(\"%s not found in the entity catalog\"%uri)\n",
    "\n",
    "        p_ids = []\n",
    "        for uri in p:\n",
    "            try:\n",
    "                p_ids.append(p_index.look_up_by_uri(uri)[0]['_source']['id'])\n",
    "            except:\n",
    "                print(\"%s not found in the predicate catalog\"%uri)\n",
    "        \n",
    "        doc[hop+'_ids'] = e_ids, p_ids\n",
    "            \n",
    "        # update doc in MongoDB\n",
    "        mongo.col.update_one({'_id': doc['_id']}, {\"$set\": doc}, upsert=True)\n",
    "        count += 1\n",
    "        \n",
    "    print(\"%d documents annotated with ids\"%count)\n",
    "\n",
    "\n",
    "annotate_hop_ids('1hop')\n",
    "annotate_hop_ids('2hop')\n",
    "\n",
    "# show sample annotations\n",
    "sample = mongo.get_sample(limit=1)[0]\n",
    "print(sample['1hop_ids'])\n",
    "print(sample['2hop_ids'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1000 documents annotated with ids\n",
      "[24029645, 20271581]\n",
      "[1076]\n"
     ]
    }
   ],
   "source": [
    "# store HDT IDs for questions and answers URIs\n",
    "loaded = False\n",
    "limit = None\n",
    "\n",
    "if not loaded:\n",
    "    samples = mongo.get_sample(train=False, limit=limit)\n",
    "    count = 0\n",
    "    for doc in samples:\n",
    "        # get all correct entity and predicate from the GS annotations\n",
    "        e_ids = []\n",
    "        for uri in doc['entity_uris']:\n",
    "            try:\n",
    "                e_ids.append(e_index.look_up_by_uri(uri)[0]['_source']['id'])\n",
    "            except:\n",
    "                print(\"%s not found in the entity catalog\"%uri)\n",
    "        doc['entity_ids'] = e_ids\n",
    "\n",
    "        p_ids = []\n",
    "        for uri in doc['predicate_uris']:\n",
    "            try:\n",
    "                p_ids.append(p_index.look_up_by_uri(uri)[0]['_source']['id'])\n",
    "            except:\n",
    "                print(\"%s not found in the predicate catalog\"%uri)\n",
    "        \n",
    "        doc['predicate_ids'] = p_ids\n",
    "        \n",
    "        if 'answers' in doc:\n",
    "            a_ids = []\n",
    "            for uri in doc['answers']:\n",
    "                try:\n",
    "                    a_ids.append(e_index.look_up_by_uri(uri)[0]['_source']['id'])\n",
    "                except:\n",
    "                    print(\"%s not found in the entity catalog\"%uri)\n",
    "\n",
    "            doc['answers_ids'] = a_ids\n",
    "            \n",
    "        # update doc in MongoDB\n",
    "        mongo.col.update_one({'_id': doc['_id']}, {\"$set\": doc}, upsert=True)\n",
    "        count += 1\n",
    "        \n",
    "    print(\"%d documents annotated with ids\"%count)\n",
    "\n",
    "# show sample annotations\n",
    "sample = mongo.get_sample(limit=1)[0]\n",
    "print(sample['entity_ids'])\n",
    "print(sample['predicate_ids'])\n",
    "if 'answers' in sample:\n",
    "    print(sample['answers_ids'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1982 complex questions (with more than one variable)\n",
      "1599 complex questions (with more than one variable)\n",
      "383 complex questions (with more than one variable)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/zola/anaconda3/envs/tf36/lib/python3.6/site-packages/ipykernel_launcher.py:4: DeprecationWarning: count is deprecated. Use Collection.count_documents instead.\n",
      "  after removing the cwd from sys.path.\n",
      "/home/zola/anaconda3/envs/tf36/lib/python3.6/site-packages/ipykernel_launcher.py:5: DeprecationWarning: count is deprecated. Use Collection.count_documents instead.\n",
      "  \"\"\"\n",
      "/home/zola/anaconda3/envs/tf36/lib/python3.6/site-packages/ipykernel_launcher.py:6: DeprecationWarning: count is deprecated. Use Collection.count_documents instead.\n",
      "  \n"
     ]
    }
   ],
   "source": [
    "# dataset stats\n",
    "predicate_ids\n",
    "# number of documents with 2 hops\n",
    "print(\"%d complex questions (with more than one variable)\"%mongo.col.find({\"2hop\": { \"$ne\": [[], []] }}).count())\n",
    "print(\"%d complex questions (with more than one variable)\"%mongo.col.find({\"train\": True, \"2hop\": { \"$ne\": [[], []] }}).count())\n",
    "print(\"%d complex questions (with more than one variable)\"%mongo.col.find({\"train\": False, \"2hop\": { \"$ne\": [[], []] }}).count())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3131\n"
     ]
    }
   ],
   "source": [
    "# number of documents with >1 triple\n",
    "limit = None\n",
    "samples = mongo.get_sample(train=True, limit=limit)\n",
    "counter = 0\n",
    "for doc in samples:\n",
    "    n_components = len(doc['predicate_ids']) + len(doc['entity_ids'])\n",
    "    if n_components > 2:\n",
    "        counter += 1\n",
    "print(counter)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Min:3 Avg:11 Max:22\n"
     ]
    }
   ],
   "source": [
    "# question lengths distribution\n",
    "limit = None\n",
    "from keras.preprocessing.text import text_to_word_sequence\n",
    "import numpy as np\n",
    "samples = mongo.get_sample(train=False, limit=limit)\n",
    "n_words_distr = []\n",
    "for doc in samples:\n",
    "    words = text_to_word_sequence(doc['question'])\n",
    "    # add the sample to the dataset \n",
    "    n_words_distr.append(len(words))\n",
    "\n",
    "# show basic stats\n",
    "min_len = min(n_words_distr)\n",
    "mean_len = np.mean(n_words_distr)\n",
    "max_len = max(n_words_distr)\n",
    "print(\"Min:%d Avg:%d Max:%d\"%(min_len, mean_len, max_len))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Min:2 Avg:3 Max:6\n"
     ]
    }
   ],
   "source": [
    "# question number of question URIs distribution (predicate + entity)\n",
    "from keras.preprocessing.text import text_to_word_sequence\n",
    "import numpy as np\n",
    "samples = mongo.get_sample(train=False, limit=limit)\n",
    "n_distr = []\n",
    "for doc in samples:\n",
    "    n_distr.append(len(doc['entity_uris']+doc['predicate_uris']))\n",
    "\n",
    "# show basic stats\n",
    "min_len = min(n_distr)\n",
    "mean_len = np.mean(n_distr)\n",
    "max_len = max(n_distr)\n",
    "print(\"Min:%d Avg:%d Max:%d\"%(min_len, mean_len, max_len))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Min:1 Median:1 Max:8195\n"
     ]
    }
   ],
   "source": [
    "# number of answers per question\n",
    "import numpy as np\n",
    "\n",
    "samples = mongo.get_sample(train=False, limit=None)\n",
    "n_distr = []\n",
    "for doc in samples:\n",
    "    n_distr.append(len(doc['answers']) if 'answers' in doc else 1)\n",
    "\n",
    "# show basic stats\n",
    "min_len = min(n_distr)\n",
    "mean_len = np.median(n_distr)\n",
    "max_len = max(n_distr)\n",
    "print(\"Min:%d Median:%d Max:%d\"%(min_len, mean_len, max_len))\n",
    "p_distribution = Counter(n_distr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('http://dbpedia.org/ontology/religion', 64), ('http://dbpedia.org/ontology/award', 64), ('http://dbpedia.org/ontology/birthPlace', 52), ('http://dbpedia.org/property/awards', 50), ('http://dbpedia.org/ontology/deathPlace', 44), ('http://dbpedia.org/ontology/team', 44), ('http://dbpedia.org/ontology/country', 42), ('http://dbpedia.org/ontology/sport', 42), ('http://dbpedia.org/ontology/almaMater', 41), ('http://dbpedia.org/property/owner', 41), ('http://dbpedia.org/ontology/manufacturer', 41), ('http://dbpedia.org/property/placeOfBirth', 40), ('http://dbpedia.org/ontology/location', 39), ('http://dbpedia.org/ontology/occupation', 38), ('http://dbpedia.org/property/writer', 35), ('http://dbpedia.org/property/distributor', 35), ('http://dbpedia.org/property/birthPlace', 34), ('http://dbpedia.org/ontology/author', 34), ('http://dbpedia.org/ontology/knownFor', 34), ('http://dbpedia.org/ontology/producer', 33), ('http://dbpedia.org/ontology/formerTeam', 33), ('http://dbpedia.org/property/headquarters', 33), ('http://dbpedia.org/property/almaMater', 32), ('http://dbpedia.org/property/mascot', 31), ('http://dbpedia.org/property/successor', 31), ('http://dbpedia.org/property/starring', 31), ('http://dbpedia.org/property/label', 30), ('http://dbpedia.org/ontology/developer', 29), ('http://dbpedia.org/property/themeMusicComposer', 29), ('http://dbpedia.org/property/children', 29)]\n",
      "592 predicates\n"
     ]
    }
   ],
   "source": [
    "# number of unique predicates and distribution \n",
    "limit = None\n",
    "\n",
    "# training set\n",
    "samples = mongo.get_sample(train=True, limit=limit)\n",
    "predicates = []\n",
    "for doc in samples:\n",
    "    predicates.extend(doc['predicate_uris'])\n",
    "\n",
    "# test set\n",
    "# samples = mongo.get_sample(train=False, limit=limit)\n",
    "# for doc in samples:\n",
    "#     predicates.extend(doc['predicate_uris'])    \n",
    "\n",
    "\n",
    "p_distribution = Counter([p for p in predicates if p != 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'])\n",
    "# most common\n",
    "print (p_distribution.most_common(30))\n",
    "\n",
    "predicates = list(set(predicates))\n",
    "print(\"%d predicates\"%len(predicates))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* 5 predicates not seen during training\n",
    "* most frequent predicate: type ('http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 1568)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAEBCAYAAACXArmGAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAIABJREFUeJzt3Xt40/d99/+nDpbPJwkfZAwmnBUSSEcgd7qmaYjBtDGFtQU2d93uLSHXb+HOuuzq1qxdIU7SbHTXva1Nsjt3szVrRn9rfjTLAcKAcqU5NiEkJARiMAkYDLF8ki18kC3r8P39YaxgwLZsZEm2X4/r4gL5+5H8lmz00ufw/XxNhmEYiIiIAOZEFyAiIslDoSAiIhEKBRERiVAoiIhIhEJBREQiFAoiIhKhUBARkQiFgoiIRCgUREQkQqEgIiIRCgUREYlQKIiISIRCQUREIqyJLiBa7e3dhMOJ39DV4cjC4+lKdBnDUo1XL9nrA9UYC8leH4y9RrPZRH5+5qjvN2FCIRw2kiIUgKSpYziq8eole32gGmMh2euD+Nao4SMREYlQKIiISIRCQUREIhQKIiISoVAQEZEIhYKIiEQoFEREJGLCnKeQrIJh8AeCw7ZJTbFiVfyKyASgULhK/kCQg8eahm2zzFWENVUvtYgkP31+FRGRCIWCiIhEKBRERCRCoSAiIhEKBRERiVAoiIhIhEJBREQiFAoiIhKhUBARkQiFgoiIRCgUREQkQqEgIiIRCgUREYlQKIiISIRCQUREIhQKIiISoVAQEZEIhYKIiEQoFEREJEKhICIiEQoFERGJUCiIiEiEQkFERCIUCiIiEqFQEBGRCIWCiIhERBUKdXV1bNy4kYqKCjZu3Mjp06cvaxMKhaiurqa8vJyVK1eyY8eOyDGPx8Pdd9/NmjVrWL16NQ888ADBYDBmT0JERGIjqlDYunUrVVVV7N27l6qqKrZs2XJZm507d1JfX8++fft45plnePTRRzl37hwATzzxBHPmzGHnzp3s3LmTjz76iH379sX2mYiIyFUbMRQ8Hg81NTVUVlYCUFlZSU1NDW1tbYPa7d69m/Xr12M2m7Hb7ZSXl7Nnzx4ATCYT3d3dhMNh+vr6CAQCFBUVjcPTERGRqzFiKLjdboqKirBYLABYLBYKCwtxu92XtSspKYncdjqdNDY2AnDPPfdQV1fHF77whcifpUuXxvJ5iIhIDFjj8U327NnDggUL+PnPf053dzebNm1iz549rF69OurHcDiyxrHC0SkoyI7822jzkZ2VNmz7jIxUCuwZ413WIBfXmKySvcZkrw9UYywke30Q3xpHDAWn00lTUxOhUAiLxUIoFKK5uRmn03lZu4aGBhYvXgwM7jls376dRx55BLPZTHZ2NitWrODAgQOjCgWPp4tw2BjNcxsXBQXZtLR0Rm77/EE6u3qHvY/P56clFBrv0iIurTEZJXuNyV4fqMZYSPb6YOw1ms2mMX2YHnH4yOFw4HK52LVrFwC7du3C5XJht9sHtVu9ejU7duwgHA7T1tbG/v37qaioAKC0tJTXXnsNgL6+Pt566y3mzZs36mJFRGR8RbX66IEHHmD79u1UVFSwfft2qqurAdi0aRNHjhwBYO3atZSWlrJq1So2bNjA5s2bmTFjBgDf+973eO+991izZg3r1q1j1qxZbNiwYZyekoiIjJXJMIzEj8lEIVmHj7r9QQ4eaxr2PstcRWSmxmX6BpjcXeJ4Sfb6QDXGQrLXB0k4fCQiIlOHQkFERCIUCiIiEqFQEBGRCIWCiIhEKBRERCRCoSAiIhEKBRERiVAoiIhIhEJBREQiFAoiIhKhUBARkQiFgoiIRMRv684pzGQ20e0PDnk8NcWKVfEsIklAoRAH/kCIwydahjy+zFWENY5ba4uIDEWfT0VEJEKhICIiEQoFERGJUCiIiEiEQkFERCIUCiIiEqFQEBGR
CIWCiIhEKBRERCRCoSAiIhHaWyEBevxBDtQ0cb67j15/iNcON/DNlfOZU5Kb6NJEZIpTTyHOevxB9r1zlobWbnIzbZQVZ9He4eeRp9/j6T3HCQRDiS5RRKYw9RTiaCAQunsDrFhaSrE9A4Dr5zj49Ttn2XfwLJ09Af5s7XWYzaYEVysiU5F6CnH07vFmunoGBwJAms3K798+j99fMZf3alvYvq8WwzASWKmITFXqKcRJp6+P0+5OXLPyBwXCxVYtn8l5Xx///XY9jtw07rh5VnyLFJEpTz2FOPmorg2TycS1s+zDtvvGrXNYtrCQ51+vo87dEafqRET6KRTioKO7j0/OdTC3NIeMtOE7ZyaTiT9avYDcLBs/3VmDv08TzyISPwqFOHj1/U8xDINF1wzfSxiQmZbCnXdcS3Obj2d+88k4Vyci8hmFwjgLhw3ePtpIWXE22Rm2qO/nKsunYvlMXnn/U06c9Y5jhSIin1EojLOmdh+9fSFmObNHfd+1t1yDIyeN/9hXSzAUHofqREQGiyoU6urq2LhxIxUVFWzcuJHTp09f1iYUClFdXU15eTkrV65kx44dg47v3r2bNWvWUFlZyZo1a2htbY3JE0h255q7sVpMOB2Zo75vaoqFqvJ5fNrSzf53z41DdSIig0W1JHXr1q1UVVWxdu1aXnjhBbZs2cLTTz89qM3OnTupr69n3759eL1e1q1bx80330xpaSlHjhzhscce4+c//zkFBQV0dnZis0U/lDKRnWvpYm5pHinWsXXKbpg3jSVzHLzwRh3LXYXYc9JiXKGIyGdGfKfyeDzU1NRQWVkJQGVlJTU1NbS1tQ1qt3v3btavX4/ZbMZut1NeXs6ePXsA+Pd//3f+9E//lIKCAgCys7NJTU2N9XNJOh3dfXT6Arhm5Y/5MUwmE1Ur5xMKGzz32qkYVicicrkRewput5uioiIsFgsAFouFwsJC3G43drt9ULuSkpLIbafTSWNjIwAnT56ktLSUb37zm/h8PlauXMmf/dmfYTJFv5WDw5EVddvxVlDw2fyA0eYjO+vKn95PNnQCsGR+IS3tPUM+XkZGKgVDnNA28P3W3DKb51/9hI0VC7kmio3zLq4xWSV7jcleH6jGWEj2+iC+NcbljOZQKERtbS1PPfUUfX193HXXXZSUlLBu3bqoH8Pj6SIcTvzWDwUF2bS0dEZu+/xBOrt6r9j25DkveVk2stOtnDp75TYAPp+fltDw5yPctsTJ3rdO8+RzR7hvw5JR1ZiMkr3GZK8PVGMsJHt9MPYazWbTmD5Mjzh85HQ6aWpqInThTSsUCtHc3IzT6bysXUNDQ+S22+2muLgYgJKSElavXo3NZiMrK4vbb7+dDz/8cNTFTiR9gRBN7T5KC2PTw8lKT+GOz5dx5JSHY2faY/KYIiKXGjEUHA4HLpeLXbt2AbBr1y5cLtegoSOA1atXs2PHDsLhMG1tbezfv5+Kigqgfx7ijTfewDAMAoEAb7/9NgsXLhyHp5M83B4fhgHTC0a/6mgo5UtLseek8qtXTmrDPBEZF1EtiXnggQfYvn07FRUVbN++nerqagA2bdrEkSNHAFi7di2lpaWsWrWKDRs2sHnzZmbMmAHAHXfcgcPh4Ctf+Qrr1q1j7ty5fOMb3xinp5QcWs/3YjaZmJYbu9VCKVYLlZ+fRZ27g5rT6i2ISOyZjAnykTNZ5xS6/UEOHmu6rN3eA/WEwgZfubmMJfMLOHyiZcjHXOYqIjM1uumdQDDM/f/3LQry0rn/m78TVY3JKNlrTPb6QDXGQrLXB0k4pyCjFzYMPB29Me0lDEixmvnyTTM5cdZLbb16CyISWwqFcXC+y08wZDAtL3ahEAz390q6/UGWugrJzkjhhTfqIl/r9gcJaicMEblKusjOOGj19i8/nZabHlV7k9lEtz84bJuwAe8d/2yYat6MPA7VtvDfb5+J9EiWuYqwRjkMJSJyJXoHGQct53uxpZjJzkiJqr0/EBp2
zgFgyfyCQbfnl+by4SetHD/TzhcWO4e4l4jI6Gj4aBy0enuYlps+qjO2R8uWYmHu9FxOuzvw9Q7fyxARiZZCIcYCwTDnu/rGZZL5UgvL8gkb6HoLIhIzCoUY83T0YkBMJ5mHkpNpY3pBJifOegmFNcssIldPoRBjrd7+je/i0VOA/iu09faFOO1O7rXWIjIxKBRirPV8L1npKaTZ4jOH73RkkJtpo7ZeQ0gicvUUCjHm7fRjz4nftSJMJhPzSnNpPd9LQ2t33L6viExOCoUYCoXCdPoC5GXF9wJCs6fnYDaZeOtoY1y/r4hMPgqFGDrf3YcB5GbF91KjaTYrM4uyeOdYE4Hg8NdlEBEZjkIhhrxdfQDkx7mnADC3NBdfb5D3aoc/CU5EZDgKhRjydvkxmSA7M749BeifcHbkpPHa4YaRG4uIDEGhEEPeTj85mTYs5vE7k3koJpOJm68r5ni9l6Y2X9y/v4hMDgqFGPJ29cV9kvliy1yFALzy3tmE1SAiE5tCIUYCwTBdPQHy4jzJfDF7ThoLZuTx8rtndblOERkThUKMnO/un2ROZE8B4PPXFdPQ2s0pd0dC6xCRiUmhECPnu/xA4kPhxoWF2KxmfqtzFkRkDBQKMeLt8mM2maK+hsJ4SU+18j+uc/JOTRPBkDbJE5HRUSjEiLezj9wsG+YErDy61G03zqC7N8iRk55ElyIiE4xCIUa8Xf6ETjJf7HPzC8hKT+HAsaaRG4uIXEShEAOBYJju3mDC5xMGWCxmblxQwAeftOLv07YXIhI9hUIMdFxYeZSTgDOZh7LcVURfIMzhk62JLkVEJhCFQgx0+gZCIbGTzBebPyOP3Ewb7xxrTnQpIjKBKBRioMMXACArPXl6CmaziWULC/nwpIcefzDR5YjIBKFQiIFOXx/pqRZSrMn1ci53FREMhXn/Y+2cKiLRSa53sQmq0xcgOyN5egkDZk/PwZGTqiEkEYmaQiEGOn19CT9p7UrMJhPLFhbxUV0bXT2BRJcjIhOAQuEq+ftC9PhD5CRhTwFg+bWFhMIGh05oCElERqZQuEot53uAxFxYJxplRdkU5qXzjk5kE5EoKBSuUqv3Qigk4fAR9F98Z/m1hRw70x45n0JEZCgKhavU4u0FkjcUAJYvLMIw4N1aTTiLyPAUClepxdtDms2CzWpJdClDml6QScm0TK1CEpERWRNdwETX4u1J6l4CXBhCWljIC2/U8amne9g9mlJTrCTZ6RYiEkdR/fevq6tj48aNVFRUsHHjRk6fPn1Zm1AoRHV1NeXl5axcuZIdO3Zc1ubUqVMsWbKEbdu2XXXhyaLV25OU5yhcapmrEAN48fU6Dh5rGvKPP6Czn0WmsqhCYevWrVRVVbF3716qqqrYsmXLZW127txJfX09+/bt45lnnuHRRx/l3LlzkeOhUIitW7dSXl4eu+oTzB8I4e3qS6qN8IbidPQPIZ1u7Ex0KSKSxEYMBY/HQ01NDZWVlQBUVlZSU1NDW1vboHa7d+9m/fr1mM1m7HY75eXl7NmzJ3L8pz/9KV/60peYNWtWbJ9BArUk+cqjS31u/jRavD34enUim4hc2YhzCm63m6KiIiyW/olUi8VCYWEhbrcbu90+qF1JSUnkttPppLGx/zrBx48f54033uDpp5/mX/7lX8ZUqMORNab7jYeCgmwAPmnsAqDYkUV2VtqQ7VNSrFd1PNo2GRmpFNgzBtV4sc8vns5Lvz1DY3svS+ZdfvzSxxhvV6oxmSR7faAaYyHZ64P41jjuE82BQIAf/OAH/N3f/V0kWMbC4+kiHDZiWNnYFBRk09LSPwTzyZn+3pLZZNDZ1TvkfQKB4FUdj7aNz+enJRQaVOPFctKt5GXZqD3TzmznlX/JBh5jvA1VY7JI9vpANcZCstcHY6/RbDaN6cP0iKHgdDppamoiFAphsVgIhUI0NzfjdDova9fQ0MDixYuBz3oOLS0t1NfXc/fddwPQ0dGBYRh0dXXx0EMP
jbrgZNLU7iMzzUpqSnIsRzWZTXT7gxhtPnxX2C47bMAsZw4ffNyKrzdARtrEGPYSkfgZMRQcDgcul4tdu3axdu1adu3ahcvlGjR0BLB69Wp27NjBqlWr8Hq97N+/n1/84heUlJRw4MCBSLtHH30Un8/Hd7/73dg/mzhrbu+hIC890WVE+AMhDp9oITsr7Yq9iiXzCygryuaDj1s509iFa1Z+AqoUkWQW1eqjBx54gO3bt1NRUcH27duprq4GYNOmTRw5cgSAtWvXUlpayqpVq9iwYQObN29mxowZ41d5Emhu9zEtiUIhGrlZNvKzU7UKSUSuKKo5hTlz5lzxvIMnn3wy8m+LxRIJi+Hce++9oygveQWCIdo6/Cy/dmKFAkBZcbaGkETkinTu6hi1eHsxgIK84VcEJaOyov5J5jMXVk+JiAxQKIxRc3v/OQrJNKcQLQ0hichQFApj1NzuAyZmKED/EFKLt4duncgmIhdRKIxRk7eHjFQrGWkTc0/BgSGkeg0hichFFApj1NzmozA/HZPJlOhSxkRDSCJyJQqFMWpq76Ewf2IOHQ3QEJKIXEqhMAbBUBhPRy9F+fHZI2i8aAhJRC6lUBiD1vO9GAYTvqegISQRuZRCYQwGVh5N9J4CaAhJRAZTKIxB04VzFCZ6TwE0hCQigykUxqC5vYc0m2XCXFxnOJ8NIXUkuhQRSQIKhTFoap/Yy1Ev1T+E1KshJBFRKIxFc3sPhZNgPmGAhpBEZIBCYZSCoTCe870UTYL5hAEaQhKRAQqFUWpp7yEUNibFJPPFBoaQ2jv9iS5FRBJIoTBK7tZuYHIsR73YrOL+IaTDH7cmuBIRSSSFwii5W/vH3SdbTyEns38I6dCJlkSXIiIJpFAYpQZPN7YUM7mZtkSXEnNlxdnUuTto67j8+s4iMjUoFEbJ3dpNYV7GpFmOerGBIaT3atVbEJmqFAqj1NDSPalWHl0sJ9PG9IJM3q5pSnQpIpIgCoVRCIcNmtq6KbRPzlAAWOYqos7dQcOFCXURmVoUCqPQ1tFLMGRMupVHF1u2sBCzycSbR9yJLkVEEkChMApN3gsb4U3Q6zJHIyfTxuI5Dn57tJFQOJzockQkzhQKo9A8iXZHHc7vXu/kfHcfR0+1JboUEYkzhcIoNLf7sFnN5GWnJrqUcbVkroOs9BTe0BCSyJSjUBiF5vYeiqdlYp6Ey1EvZrWYuXlRMR983EqHry/R5YhIHCkURqGpvQenIzPRZcTFF28oIRQ2ePWDhkSXIiJxpFCIUtgwaG7voaQgK9GlxMX0aZksusbObw6dIxjShLPIVKFQiJK3008wFMY5bWr0FABW3liKt6uPd483J7oUEYkThUKUBq7L7HRM3nMULnXdbAdF9gx+/e5ZDMNIdDkiEgcKhSg1tvkAmF6QneBK4sdsMrHyxlLq3J2cbNAFeESmAoVClNyeblJTLDhy0xJdSlx9/rpiMtOsvPTb04kuRUTiQKEQpUaPj2J7Bmbz5F6Oeqk0m5XVN83k8EkPH5/zJrocERlnCoUouT2+KTWfcLHypTPIybTx7KunNLcgMskpFKLgD4TwdPRSPEVDIdVmYc3nZ3HirJejddr6QmQyiyoU6urq2LhxIxUVFWzcuJHTp09f1iYUClFdXU15eTkrV65kx44dkWOPP/44d9xxB1/96lf52te+xuuvvx6zJxAPTRcmmafKiWtXcusNJUzLTeNXr5zURnkik1hUobB161aqqqrYu3cvVVVVbNmy5bI2O3fupL6+nn379vHMM8/w6KOPcu7cOQAWL17Mr371K1588UUeeeQR7rvvPnp7J84lH92eC6Fgn5o9Bejf+mL9bXM529zF3nfOJrocERknI4aCx+OhpqaGyspKACorK6mpqaGtbfAwwu7du1m/fj1msxm73U55eTl79uwB4JZbbiE9vX9n0QULFmAYBl7vxJm0dHu6MQFFk/jiOtG4cUEBSxcU8Pzrp/hUF+ERmZSsIzVw
u90UFRVhsVgAsFgsFBYW4na7sdvtg9qVlJREbjudThobGy97vOeff56ZM2dSXFw8qkIdjsRtL9HW1UeRI4MSZx4ABRedq2C0+cjOGn6ZakqKddg2Ix0fS5srtY3mMTIyUikYpkf07T/4HTb/6Df8x75afvS/bsFiGfu0VEGSn/OR7PWBaoyFZK8P4lvjiKEQS++88w4//vGP+dnPfjbq+3o8XYTDiVn5crqhg8K8dFpaOikoyKalpTNyzOcP0tk1/FBYIDB8m5GOj7ZNdlbaFdtG8xg+n5+WUGjYNn9QPpefvljDE88eZsNtc4dtO5RLX8dkk+z1gWqMhWSvD8Zeo9lsGtOH6RE/5jmdTpqamghdeKMIhUI0NzfjdDova9fQ8NmOmm63e1Bv4P333+ev/uqvePzxx5k9e/aoC02UcNigqb3/HAXpd5OriC99bjp7DtTz2mHtoioymYwYCg6HA5fLxa5duwDYtWsXLpdr0NARwOrVq9mxYwfhcJi2tjb2799PRUUFAB9++CH33XcfP/nJT1i0aNE4PI3x4+noJRAMT9lzFK7EZDJRVT6PRdfY+Y+9tdSc1jJVkckiqgHhBx54gO3bt1NRUcH27duprq4GYNOmTRw5cgSAtWvXUlpayqpVq9iwYQObN29mxowZAFRXV9Pb28uWLVtYu3Yta9eupba2dpyeUmxFVh5NkeWoJrOJbn9wyD99Iej2B/EHw/zxlxdSkJfOT371Ie+daIm0CWrFqsiEFdWcwpw5cwaddzDgySefjPzbYrFEwuJSzz777BjLS7xGT/8qm6ly4po/EOLwiZYhjy+ZXzDo+C1LnOx/9xz/57mjfPEGJzOLslnmKsKaGtfpKhGJEZ3RPAJ3m4/MNCvZ6SmJLiUppadaWbV8BvacVF59v4EjJz2EtRWGyISlUBjBuZYuSqZlYprk12W+GqkpFlYum0GZM5v3P27lyRc+ors3kOiyRGQMFArDCIcNzjV3M7Mo+dcxJ1qK1cwti50sdxVy7Ew71U8d5Exjci/1E5HLKRSG0dTuwx8IMbNoalyX+WqZTCYWluXz7Q1LCBsGP/yP93j1g0/HtLNqMMywE96azBYZH5oNHEZ9UxcAZeopjMo1zhy2/s9l/HRnDT/fU8uxM+38UcVCMtKi/3XzB4IcPNY05HFNZouMD/UUhlHf3InFbKJk2tRYjhorJrMJs8XM3WsXseZ3Z/Hu8Wa2/uwdPjrTpk/5IklOoTCM+qYupk/LxHoV+/tMRf5AiIPHmnjveDP52amsWj6T3r4g//jLD3hq9zF6+jQJLZKs9G43BMMwqG/q1CRzDBTmp1P5u7OYUZjFodoWnnjuKN5Of6LLEpErUCgMwdvVR6cvwAxNMsdEaoqFW28o4aZri/j4nJc//9+/4SNtjyGSdBQKQ6hv6l9OqUnm2DGZTCyYmcdfVf0OWRkp/OMvP+DZV08SDGmSQSRZKBSGMBAKMwrVU4i1kmmZ/OO3b+WWJU5eeusM235xiKZ2X6LLEhEUCkOqb+6iMD+ddC17HBdpqVb+55dd/D9rF9Hg6eYH/3qAZ189ib9v+Gs5iMj40jveEOqbOjV0FAfLXUXMK83jV698wktvneGNI26+vHwmN15blOjSRKYkhcIV+HoDtHh7uWVxyciN5arlZ6eyac0ivvS56Tz32il++fIn7HrrDNc4s5k/M4/MNG1GKBIvCoUrqD3rBWBeaW6CK5mcTGYTzW0+fP7goK+XFGSx+euLOdVwnl8fPMeRUx6O1rUxszCLBWX5FOWna2NCkXGmULiC42e8pFjNzC5RKIwHfyDEsTPNw14v+k8qXbzxwaecOHuej895OdPURV6WjcVzp1GmZcIi40ahcAXH69uZOz2XFKvm4RMpO8PG0gUFLJnr4LS7k4/q2njtgwbysmxkZ9q4cX5hoksUmXT0rneJrp4AZ5u7WDAzL9GlyAVWi5m5pbms+cIsvrDYSShs8C//dZQnXjjK+S6dGS0S
S+opXKK2vn8+YeHM/ARXIpcym0zMLsmhrDiLto4+fn2wno/q2rjzjmu5Yd60RJcnMimop3CJ4/Xt2FLMzC7JSXQpMgSL2cxXbi6j+k+X48hN4yfPfsh/7v+YgLZfFblqCoVLHK9vZ970XO2MOgE4HZl8/1s3cvvSUn797lke2f6ezowWuUp657tIh6+PT1u6WVimoaOJIsVq5psr53Pv166n1dtD9VMHebumMdFliUxYCoWLaD5h4vrc/AIe+JPllBZm8dMXa/jZ7mPaMkNkDBQKFzl0ooXMNCtlxdreYiJy5Kbx3arPUfn5Mt780M2DPz9IbX17ossSmVAUChf4eoMcOtHCcleR5hMmAJPZRLc/eNmf3kCYipvK2Pz16+kLhNn2/77Pkzs/wnN+6BPlROQzWpJ6wbu1zQSCYT5/fXGiS5Eo+AMhDp9oGbbN9/5oKb859Cl7DpzhnWPN3HRtEauWzWBGYZa2yxAZgkLhgt8ebaTYnsFsp5aiTha2FAtf++Jsbl1Swt6D9bx2uIHfHm3E6chg2cJCFl1j5xpnjnqGIhdRKAAt3h5OnPXytS/O1ifISciRm0ZV+Xy++rvXcPB4MwePNbHzzdO8+OZpbClmrinOYWZRNjOLsigrysY5LSPRJYskjEIBeOtoIybg5kUaOprMstJTuO1z07ntc9Pp6glQW9/O8TNe6ho7ePWDT+m7cPKb1WJmekEmjpw0ivLTKbJnUJSfjiMnjbzs1EE9i2AY/IHgUN8SgNQUK9pGSyaKKR8KgWCI1z9sYGFZPo7ctESXIzE0MBk91LHr5xaycJYdgFDYoLndx7nmLs61dOPp6OXT1i4Of9JKKGx8dj8gLzsVe04q9uw0cjJtdPj6yEyzkpmeQk6G7bKNFJe5irDG4Qp+IwWUwkmiMeVDYc+Bejwdfv7kK65ElyIxNtJk9JL5BVc8XlqQiesaB51dvSxdUEivP0hzew+ejl7aOnpp6/Dj6eilvqkTT0cvwZAx6P752akU5KVTWpiJ0xG/oSh/IMjBY01DHo9XOMnENqV/Qzzne3nprTPcuKCAay98YhS5mMVqJsOcwqz0FGZdYT+sUNjgzQ8b6O4N0t0TwNvlp7m9h1MN5zlx1kuKxcyxM16WLyy9URGKAAALFUlEQVTk+tkOXfNbkt6U/g39/37zCQAbVsxNcCWSrKLpbaSnWklPtTItN40y+k98DIXDuD0+6pu6OFHv5VBtCylWM9fPdnDjwgIWz3aQMYbLjAaCIRrbenB7uvG+9yknz7ZzvruPHn8QfyBEIBjGYjZFhrNyM23Yc9PIz0od82sgU8uUDYU3j7g5eLyZdV+4hmm56YkuRyYZi9lMaUEWpQVZLF1QSENLF+/WtvBubTOHTrREtgFfMDOPGYVZlDgyycpIIT3VSjhs0NsX4ny3n4bWbhpafbg93TS0dtPs7cG4MFplNsG0vHTyslJx5KRhMpto8fYQDIVp7+rjbEs34QvzIRaziYPHm7l+tgNXWT5lRdmYzVppJ5ebkqHw+uEG/v2/j+Mqy+fL/2NmosuRSc5sNrFgZj4LZubzB+XzOPnpeY6cauOjOg97DtQPmsi+EovZRJE9gxmFWSx3FVEyLZOSaZlcN7+Q897PdoXt9g+eUzAMg05fAE9HL63eXrxdfn71ykkAMtOsLJyZz7Wz8rn2GjuFebr+tfSLKhTq6uq4//778Xq95OXlsW3bNmbNmjWoTSgU4uGHH+b111/HZDJx9913s379+hGPxVNvX5A9B+p58c3TXHeNnf/1tetJsVriXodMLZeugiopyKKkIIuKm2YSCIZpavfRdt5Pb1+QHn8Qs8lEeqqFrPQUnI5MCvPTwWS+bGWRt9OP76LHvTRbTCYTOZk2cjJtXOPMYfmiYs53+jlx1kttfTu19V7euzA0Zs9JZcHMfBaW5TNvei5F+QoJmJpLjqMKha1bt1JVVcXatWt54YUX2LJlC08//fSgNjt37qS+vp59+/bh
9XpZt24dN998M6WlpcMeG2/hsMG5li6OnPKw7+BZOn0BlrsKufMOlwJB4iKaLTmWuYrIHGYS+tJeAEB2VhqdXZ/t6bRkfsGIdQxsEDh/Rh7zSnPp9AVo8HTT6PHx7vFm3jrav+14Rqr1Qo8kA0duOvlZqWRnpJCaYiElxUw4bBAIhgmGwgSCBoFQiF5/iJ4LwdbjD9HrDxICznf2XzLVZOpf0msymTCbTWSkWslIu/AnNYXMNCtZ6SlkpqeQlZ5CVkb/1yzm+L7j+gMhznf58Xb10ezt4chJz4XnFMR34W9/IEQobBAOG9isFlKsZtJSreRmpESCODfTRm5W6oW/beRmppKTmRL35zNaI4aCx+OhpqaGp556CoDKykoeeugh2trasNs/W7Gze/du1q9fj9lsxm63U15ezp49e7jrrruGPRatsYx/1jd18rOXjkU+TS2e42DV8pmUFV3dLqgX12K1mEecMBypTSwe4+I26alWQsHL2yZLrVaLecga413HUMcH6ovbzy7Fgn+YK8eZzZc/xqWv4VhqzUy3UezIBCBsGJRMy6S5rf98jSZvD/XN3Ry/sKV8tEym/k/PqTYL6alWMtJSMJnAMMDAAEwX5j38uNt89PYFI/MkV5JuGwiPFDLSrNisZqxWMxaziRSLGYvFjNViwmoxYb6od3PpQ4bCBqGQQTAc7v87FMagP7R8/iA9vf1v+v7A5Vuum0yQZrOSk2mjID+DVJsFi6n/ErH23DQwoLcvRFdPH109AVobOgb14i5+nMx0GznpF55LSn+g2KxmbCkWMIFxIWxCYQOLxcw3bp8/pve/sc4ZjRgKbreboqIiLJb+T9UWi4XCwkLcbvegUHC73ZSUlERuO51OGhsbRzwWrfz8zFG1B3A4snj0Wueo7xfN416s1Jk74n1mlw5/jYaRjseqjR4jOR8jGjNL4vN7JlNbcvdjREQkrkYMBafTSVNTE6FQf5cqFArR3NyM0+m8rF1DQ0Pkttvtpri4eMRjIiKSPEYMBYfDgcvlYteuXQDs2rULl8s1aOgIYPXq1ezYsYNwOExbWxv79++noqJixGMiIpI8TIYx3BRPv5MnT3L//ffT0dFBTk4O27ZtY/bs2WzatIk///M/5/rrrycUCvHggw/y5ptvArBp0yY2btwIMOwxERFJHlGFgoiITA2aaBYRkQiFgoiIRCgUREQkQqEgIiIRU3KX1LGKZmPAeNq2bRt79+7l008/ZefOncyfPz/p6mxvb+ev//qvqa+vx2azUVZWxoMPPojdbueDDz5gy5Yt+P1+pk+fzj/8wz/gcDjiXuM999zDuXPn+reSyMjgBz/4AS6XK6lexwGPPfYYjz76aOTnnSyvIcCKFSuw2WykpvZfu+E73/kOt9xyS9LU6Pf7eeSRR3jrrbdITU3lhhtu4KGHHkqan/O5c+fYvHlz5HZnZyddXV2888478a3RkKh961vfMp5//nnDMAzj+eefN771rW8ltJ6DBw8aDQ0Nxm233WbU1tZGvp5Mdba3txtvv/125Pbf//3fG3/zN39jhMNho7y83Dh48KBhGIbx+OOPG/fff39Cauzo6Ij8+9e//rWxbt06wzCS63U0DMM4evSoceeddxpf+tKXjNra2qR6DQ3DuOz30DCMpKrxoYceMn74wx8a4XDYMAzDaGlpMQwj+X7OAx5++GGjurraMIz41qhQiFJra6uxdOlSIxgMGoZhGMFg0Fi6dKnh8XgSXNng/4zJXKdhGMaePXuMP/7jPzYOHz5s3HHHHZGvezwe44YbbkhgZf2ee+454/d+7/eS7nX0+/3Ghg0bjPr6+sjPO9lewyuFQrLU2NXVZSxdutTo6uoa9PVk+zkP8Pv9xk033WQcPXo07jVq+ChK0W4MmGjJXGc4HOY///M/WbFixWWbJNrtdsLhcKR7HG/f//73efPNNzEMg3/9139Nutfxxz/+MV/96leZMWNG5GvJ9hpC/5CRYRgsXbqUv/zL
v0yaGs+ePUteXh6PPfYYBw4cIDMzk29/+9ukpaUl1c95wMsvv0xRURGLFi3i6NGjca1RE80SNw899BAZGRn84R/+YaJLucwPf/hDXnnlFe677z5+9KMfJbqcQd5//32OHDlCVVVVoksZ1i9+8QtefPFFnn32WQzD4MEHH0x0SRHBYJCzZ89y7bXX8l//9V985zvf4d5778Xn84185wR49tln+frXv56Q761QiFK0GwMmWrLWuW3bNs6cOcM///M/YzabL9sksa2tDZPJlLBPuAPWrVvHgQMHKC4uTprX8eDBg5w6dYrbb7+dFStW0NjYyJ133smZM2eS6jUceG1sNhtVVVUcOnQoaX7OJSUlWK1WKisrAViyZAn5+fmkpaUlzc95QFNTEwcPHmTNmjVA/P9PKxSiFO3GgImWjHX+0z/9E0ePHuXxxx/HZrMBcN1119Hb28u7774LwC9/+Uu+/OUvx7227u5u3G535PbLL79Mbm5uUr2Od999N2+88QYvv/wyL7/8MsXFxfzbv/0bd911V1K8hgA+n4/Ozk6g/9rQu3fvxuVyJc3P2W63c9NNN0X2X6urq8Pj8TBr1qyk+TkPeO6557j11lvJz++/9kW8fxe199EoDLUxYKI8/PDD7Nu3j9bWVvLz88nLy+Oll15Kqjo//vhjKisrmTVrFmlpaQCUlpby+OOPc+jQIbZu3TpoqeK0adPiWl9rayv33HMPPT09mM1mcnNz+e53v8uiRYuS6nW82IoVK3jiiSeYP39+UryG0D9mf++99xIKhQiHw8yZM4e//du/pbCwMKlq/N73vofX68VqtfIXf/EX3HrrrUn3c66oqOD73/8+X/ziFyNfi2eNCgUREYnQ8JGIiEQoFEREJEKhICIiEQoFERGJUCiIiEiEQkFERCIUCiIiEqFQEBGRiP8fkJS81SxKyvcAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "def plot_distribution(counter):\n",
    "    '''\n",
    "    Plot the distribution of the values stored in a counter-like mapping.\n",
    "\n",
    "    Only the values (e.g. the counts of a collections.Counter) are plotted,\n",
    "    the keys are ignored. The figure is drawn with seaborn and shown with plt.show().\n",
    "    An empty counter is reported and nothing is drawn.\n",
    "\n",
    "    counter: mapping whose values are the numbers to plot\n",
    "    '''\n",
    "    # prepare data (zip(*counter.items()) used to fail to unpack on an empty counter)\n",
    "    values = list(counter.values())\n",
    "    if not values:\n",
    "        print(\"Nothing to plot: the counter is empty\")\n",
    "        return\n",
    "\n",
    "    # generate a plot\n",
    "    import seaborn as sns\n",
    "    sns.set(color_codes=True)\n",
    "    # NOTE(review): distplot is deprecated in seaborn >= 0.11 (histplot/displot replace it);\n",
    "    # kept because the seaborn version of this environment is not visible from this cell\n",
    "    sns.distplot(values)\n",
    "    plt.show()\n",
    "\n",
    "plot_distribution(p_distribution)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# dump the training questions, one per line, as input for generating embeddings\n",
    "# (`mongo` and `limit` are not defined in this cell; they must come from cells run earlier)\n",
    "samples = mongo.get_sample(train=True, limit=limit)\n",
    "os.chdir(\"/home/zola/Projects/temp/KBQA/data/lcquad\")\n",
    "with open('questions.txt', 'w') as questions_file:\n",
    "    for doc in samples:\n",
    "        question_line = doc['question'] + '\\n'\n",
    "        questions_file.write(question_line)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "999 documents annotated with entities and predicates URIs across hops\n",
      "SELECT DISTINCT ?uri WHERE {?uri <http://dbpedia.org/property/music> <http://dbpedia.org/resource/Akira_Ifukube>  . ?uri <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/Film>}\n",
      "['http://dbpedia.org/ontology/Film']\n",
      "[2979987]\n"
     ]
    }
   ],
   "source": [
    "# annotate rdf:type classes separately\n",
    "loaded = True\n",
    "\n",
    "verbose = False\n",
    "limit = None\n",
    "\n",
    "RDF_TYPE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'\n",
    "\n",
    "if not loaded:\n",
    "    samples = mongo.get_sample(train=False, limit=limit)\n",
    "    count = 0\n",
    "    for doc in samples:\n",
    "        # fix URI !\n",
    "        sparql_query = doc['sparql_query']\n",
    "        # parse the SPARQL query into spo triples: take the text between the first '{'\n",
    "        # and the first '}' and split it on '. '\n",
    "        tripples = sparql_query[sparql_query.find(\"{\")+1:sparql_query.find(\"}\")].split('. ')\n",
    "\n",
    "        # collect the class URIs (constant objects of rdf:type triples) and their index ids\n",
    "        classes = []\n",
    "        cids = []\n",
    "        for tripple in tripples:\n",
    "            if tripple:\n",
    "                s, p, o = tripple.strip().split()\n",
    "                # drop the first and last character (the brackets enclosing a full URI)\n",
    "                p = p[1:-1]\n",
    "                # an object starting with '?' is a variable, not a class\n",
    "                if p == RDF_TYPE and o[0] != '?':\n",
    "                    uri = o[1:-1]\n",
    "                    classes.append(uri)\n",
    "                    # NOTE(review): on a failed lookup the URI stays in classes but no id is\n",
    "                    # appended, so classes and classes_ids can differ in length - confirm\n",
    "                    # that consumers do not rely on positional alignment\n",
    "                    try:\n",
    "                        cids.append(e_index.look_up_by_uri(uri)[0]['_source']['id'])\n",
    "                    except Exception:\n",
    "                        # best-effort lookup; unlike a bare except this still lets\n",
    "                        # KeyboardInterrupt/SystemExit stop a long annotation run\n",
    "                        print(\"%s not found in the entity catalog\"%uri)\n",
    "\n",
    "        if classes and verbose:\n",
    "            print(sparql_query)\n",
    "            print(classes)\n",
    "            print(cids)\n",
    "\n",
    "        doc['classes'] = classes\n",
    "        doc['classes_ids'] = cids\n",
    "\n",
    "        mongo.col.update_one({'_id': doc['_id']}, {\"$set\": doc}, upsert=True)\n",
    "        count +=1\n",
    "\n",
    "    print(\"%d documents annotated with entities and predicates URIs across hops\"%count)\n",
    "\n",
    "# show sample annotation\n",
    "sample = mongo.get_by_id(\"2652\")\n",
    "print(sample['sparql_query'])\n",
    "print(sample['classes'])\n",
    "print(sample['classes_ids'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## QALD"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "215 docs\n",
      "{'SerialNumber': '0',\n",
      " '_id': ObjectId('5c5dddac06669524ac939efa'),\n",
      " 'answers': ['1863-07-03'],\n",
      " 'question': 'When was the Battle of Gettysburg?',\n",
      " 'sparql_query': 'PREFIX dbo: <http://dbpedia.org/ontology/> PREFIX res: '\n",
      "                 '<http://dbpedia.org/resource/> SELECT DISTINCT ?date  WHERE '\n",
      "                 '{         res:Battle_of_Gettysburg dbo:date ?date . }',\n",
      " 'train': False}\n",
      "PREFIX dbo: <http://dbpedia.org/ontology/> PREFIX res: <http://dbpedia.org/resource/> SELECT DISTINCT ?date  WHERE {         res:Battle_of_Gettysburg dbo:date ?date . }\n"
     ]
    }
   ],
   "source": [
    "# imported unconditionally: pprint is used below even when the dataset is already loaded\n",
    "import pprint\n",
    "\n",
    "mongo = Mongo_Connector('kbqa', 'qald')\n",
    "\n",
    "ENDPOINT = 'http://wikidata.communidata.at/dbpedia/query'\n",
    "\n",
    "# load QALD dataset into MongoDB (set loaded = False to import it)\n",
    "loaded = True\n",
    "\n",
    "if not loaded:\n",
    "    import os\n",
    "    os.chdir(\"/home/zola/Projects/temp/KBQA/data/qald\")\n",
    "\n",
    "    import json\n",
    "    # imported here so that this branch does not depend on an import made by another cell\n",
    "    import requests\n",
    "    data_path = \"qald-7-train-multilingual.json\"\n",
    "    with open(data_path) as f:\n",
    "        dataset = json.load(f)\n",
    "        for q in dataset['questions']:\n",
    "            doc = {}\n",
    "            doc['SerialNumber'] = q['id']\n",
    "            # first entry of the multilingual question list - presumably English; verify\n",
    "            doc['question'] = q['question'][0]['string']\n",
    "            doc['sparql_query'] = q['query']['sparql'].replace('\\n', ' ')\n",
    "            # run the query against the endpoint to store its answers next to the question\n",
    "            response = requests.get(ENDPOINT, params={'query': doc['sparql_query'], 'output': 'json'}).json()\n",
    "            if 'results' in response:\n",
    "                # SELECT query: flatten the values of all bindings into one list\n",
    "                results = response['results']['bindings']\n",
    "                doc['answers'] = [v['value'] for r in results for v in r.values()]\n",
    "            elif 'boolean' in response:\n",
    "                # ASK query: a single boolean answer\n",
    "                doc['bool_answer'] = response['boolean']\n",
    "            doc['train'] = False\n",
    "            mongo.col.insert_one(doc)\n",
    "\n",
    "mongo.count_all_docs()\n",
    "doc = mongo.get_sample(train=False, limit=1)[0]\n",
    "pprint.pprint(doc)\n",
    "print(doc['sparql_query'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "PREFIX dbo: <http://dbpedia.org/ontology/> PREFIX res: <http://dbpedia.org/resource/> SELECT DISTINCT ?date  WHERE {         res:Battle_of_Gettysburg dbo:date ?date . }\n",
      "['res:Battle_of_Gettysburg']\n",
      "['dbo:date']\n"
     ]
    }
   ],
   "source": [
    "# parse the SPARQL query into spo triples\n",
    "cursor = mongo.get_sample(train=False, limit=1)\n",
    "with cursor:\n",
    "    for doc in cursor:\n",
    "        sparql_query = doc['sparql_query']\n",
    "        print(sparql_query)\n",
    "        # text between the first '{' and the first '}', split into clauses on '. '\n",
    "        tripples = sparql_query[sparql_query.find(\"{\")+1:sparql_query.find(\"}\")].split('. ')\n",
    "\n",
    "        # collect and print the entities and predicates of each clause\n",
    "        for tripple in tripples:\n",
    "            clause = tripple.split()\n",
    "            if not clause:\n",
    "                # empty or whitespace-only fragment left over by the split\n",
    "                continue\n",
    "            entities, predicates = [], []\n",
    "            # filter clauses are not triples; match the keyword case-insensitively,\n",
    "            # also when it is glued to its parenthesis, e.g. FILTER(...)\n",
    "            if not clause[0].lower().startswith('filter'):\n",
    "                if len(clause) != 3:\n",
    "                    # anything else that is not a plain 's p o' pattern cannot be unpacked\n",
    "                    print(\"skipping clause that is not a plain s p o triple: %s\" % tripple.strip())\n",
    "                    continue\n",
    "                s, p, o = clause\n",
    "                # terms starting with '?' are variables, everything else counts as an entity\n",
    "                if s[0] != '?':\n",
    "                    entities.append(s)\n",
    "                if o[0] != '?':\n",
    "                    entities.append(o)\n",
    "                predicates.append(p)\n",
    "            print(entities)\n",
    "            print(predicates)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "tf36",
   "language": "python",
   "name": "tf36"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
